/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.examples;

import java.util.ArrayList;
import java.util.List;

import mosdi.discovery.EvaluatedPattern;
import mosdi.discovery.MatchCountSearch;
import mosdi.discovery.MotifFinder;
import mosdi.fa.Alphabet;
import mosdi.index.SuffixTree;
import mosdi.util.BitArray;
import mosdi.util.Iupac;
import mosdi.util.IupacStringConstraints;
import mosdi.util.Log;

/**
 * Minimal end-to-end example of motif discovery: builds a suffix tree over one
 * hard-coded DNA sequence, searches for IUPAC patterns of a fixed length that
 * occur often enough, and prints every reported pattern with its score.
 */
public class SimpleMotifDiscovery {

	/** The DNA sequence that is searched for motifs. */
	static final String sequence = 
		"ATTCACAAGCACAACGCATAAAAGGACGACCTGGCCTGCCAAGTGCAACGGCGAAGTTTTCGAACGTCGGTGCGGGGCCGTGTTGCCCGACTCATCATCA" +
		"GCTGGAACGACGCCCGCGGTCGTTTCACCGCAGGGGCGGCCATAGGATGTCAAGCCGGACACGATGTTTGCCCCGTAAAAGGATCCGACCGGGCCGAAGG" +
		"CATAAGAGGAAACGAGGTGGCCCGACGCCCTCGACGCGTCGCGCCGTCGTAAGAGGACCTCATCGCCGGTGGAAGTGCGCACAGACCTCCCGCGACAGTC" +
		"AACGCCCGGGCGGTCCGGGTGACTGTGGTGGTGAGGTCGTGACAGGTCGGCGGCTCCCGTTGAGCGCGCGTGGAGGTCACTCGCTGGCCTGGTCCGCCTC" +
		"CCCAACGCGGGCGGGGAACATCCACCTAAGAGGATTCGGCGGCTGCATTCGGCCGCCGACGGCCACCGTAGGGTCACCGTTCTGCTCCGGGACCGTTCGC" +
		"CGGGTACTGGTCGAAATATCACCGCTGGGTTTCGACCTGCGAGTAAGAGGACGGCGGCTCGGAACAAACCCGCCTGGATCGACGCCCCGGCCCCGCAGTT" +
		"CCTGAAGGTTGGTATAGTCTGCTAAGACGACTCCAGCATGAGCGGTGAGGACACCAGCCAGTAGCACCGAACAACGGTTGCATCGCCGGCCGCCCCGCGG" +
		"GGTTGGAGTGACCAATAGTCCGATGCGATCCGGTGGGCGGGATCGCCCGCATGCTATCTCAGGCTCTTCGCTGGCGTGCCACTTGGCCAAAGAGGTGGTG" +
		"GGCCATCCTTGGTTTCCCGGCTATCCGGAGATCCATGTGTCCTGCAAACCATAATTCATCACGTGCATCGTCGTCAGCCGGTGCGGCATTGTAAAACGAC" +
		"CAAACCGCGAAGTGGGGGGCCGCCCCCCGCGTTGACGTATGAAGGGTGCTGACATGGCATTTCTGAGCGCTTCGCGAACAACGCTTGATTGCCCCAGGGC" +
		"CGGGTGTGTGGGCTGCTCAAGGCGGCTTGAAGGCGGCCGCCGTAAGAGGACGGGTTGATCCGGCAGCGCCCGCGGCGACGTCGCCAGCGCGTCGTACAGC" +
		"TTGGGCGTGTTAAGAGGACCGGTACACCGCTCGAGAACGAATCGCTCTTTACGGCCGTGCTCCCTCCCTTTTACCTGGGCGGTCCGGCGAAACGGCGGCG" +
		"ACCAACAGTGCTGACGGCTCACGCGCCCAATCGATCGAGTATGGGCCACAAAGATATGTTCAGTAAGACGACAGTCGCAACGGGTGCCCGATCCTCGGCG" +
		"CGGTTCGCTAACATCGGTGGTGCCAGCCGCCACGTGGGCGGGATTGACCGCAGTTCAAAGCAGCAACTGGCCTGATCGCCGACTGCAGGCGAACGGATGC" +
		"TGCCGCGTCCACTGCAAGCATTAAAACGAGCTGGACGTGTGATAGCTGCCCAGCGACTCGCAGTCGTCGGCCATCGGGTTGCTCACGCCGCGCAATCATC" +
		"CCGTCCGCCGCGACCGACATCACGTCGGCCATAAAACGACCGATGAGGTGTTGACCAGTACGCATGCGCCGGGAATTGGCGCCTCAGTCTGCGGAGGCGT" +
		"GGCAGGACGCACCGCGGTGTTCCGGTTGCGATATTCCTTGACGAGCTGGCCGGTGAATAGGCGACGGGGCGCCGGCGCTTCTGGTAGGGCCGCGCCTGAA" +
		"GCAGGGATGGCCGGGCCCCAGCGGCGCTGCCTTGGCCGATCCACCCGGCGGGTGGTTAAAACGAAACATCCATGAGAAGACCCACACTGATGCCCTGGGA" +
		"ACCGGACCACGATCCTACCACCAGAGGGAAGTTTCCCGATGGGATTTTGACCCGGACGCGTCGCCTCCAGCATCTAAAACGATTGGATATTGAGAATGGA" +
		"ACTCCACGCTGGCTCCAGTAAAACGAGCTGCGGGCGTTGCGGCGCCTCGTGTTCCCGTCGAGCGACATTACCTAAAAGGACGCGGGGAACTCCGAAGGAT";

	/** Length of the patterns searched for. */
	private static final int PATTERN_LENGTH = 8;

	/** Maximum number of wildcard characters (N) allowed in a pattern. */
	private static final int MAX_WILDCARDS = 4;

	/** A pattern is only reported if it occurs at least this many times. */
	private static final int MIN_OCCURRENCES = 40;

	/**
	 * Runs the example search on {@link #sequence} and logs the patterns found.
	 *
	 * @param args command line arguments; not used
	 */
	public static void main(String[] args) {
		Alphabet dna = Alphabet.getDnaAlphabet();
		List<String> inputSequences = new ArrayList<String>();
		inputSequences.add(sequence);

		// Turn on debugging output.
		Log.setLogLevel(Log.Level.DEBUG);

		// Build the suffix tree without considering the reverse complementary
		// strand. The tree is not built completely: only the nodes needed to find
		// all substrings of length <= PATTERN_LENGTH are created.
		SuffixTree tree = SuffixTree.buildSuffixTree(dna, inputSequences, false, PATTERN_LENGTH);

		// For each node of the tree, the number of occurrences of the string that
		// the node corresponds to.
		int[] occurrenceCounts = tree.calcOccurrenceCountAnnotation();

		// Alphabet of IUPAC characters (e.g. N stands for aNy, R for {A,G}, ...).
		Alphabet iupac = Alphabet.getIupacAlphabet();
		BitArray[] iupacCharacterSets = Iupac.asGeneralizedAlphabet();

		// The motif finder walks the suffix tree and finds motifs according to a
		// SearchSpecification.
		MotifFinder finder = new MotifFinder(tree, iupacCharacterSets, false);

		// Search for all patterns that occur at least MIN_OCCURRENCES times.
		MotifFinder.SearchSpecification specification =
			new MatchCountSearch(MIN_OCCURRENCES, occurrenceCounts, null);

		// The constraints tell how many wildcards are allowed, etc.
		IupacStringConstraints patternConstraints =
			new IupacStringConstraints(PATTERN_LENGTH, MAX_WILDCARDS);

		finder.findIupacPatterns(PATTERN_LENGTH, patternConstraints, specification);

		printResults(specification, iupac);
	}

	/** Logs a header line followed by one line per result: pattern string and score. */
	private static void printResults(MotifFinder.SearchSpecification specification, Alphabet iupac) {
		Log.println(Log.Level.STANDARD, "--------- RESULTS ---------");
		for (EvaluatedPattern result : specification.getResults()) {
			Log.printf(
				Log.Level.STANDARD,
				"%s %d\n",
				iupac.buildString(result.getPattern()),
				result.getScore());
		}
	}

}
